These are a few customized functions that will be used during this analysis.
# Show a colour palette as a grid of swatches.
my_color_pal <- function(colours, names_or_hex, borders = NULL, cex_label = 1, ncol = NULL) {
  # Plot a named colour vector as a square grid of swatches, labelling each
  # swatch with the colour name, the hex code, or both.
  #
  # colours:      named character vector of hex colours
  # names_or_hex: "hex" labels with hex codes, "names" with the vector names,
  #               anything else shows both (name over hex)
  # borders:      border colour passed to rect() (NULL = device default)
  # cex_label:    text expansion factor for the swatch labels
  # ncol:         number of grid columns; computed from the palette size when NULL
  #
  # Side effect: draws on the current graphics device; par() settings are
  # restored on exit. Returns NULL invisibly (the value of text()).
  colours_hex <- unname(colours)
  names_colours <- names(colours)
  # layout logic adapted from scales::show_col()
  n <- length(colours)
  if (is.null(ncol)) {
    # Fix: previously ncol was overwritten unconditionally, so the ncol
    # argument was silently ignored.
    ncol <- ceiling(sqrt(n))
  }
  nrow <- ceiling(n / ncol)
  # pad with NA so the vector fills the grid exactly
  colours <- c(colours, rep(NA, nrow * ncol - length(colours)))
  colours <- matrix(colours, ncol = ncol, byrow = TRUE)
  old <- par(pty = "s", mar = c(0, 0, 0, 0))
  on.exit(par(old))
  size <- max(dim(colours))
  plot(c(0, size), c(0, -size), type = "n", xlab = "", ylab = "", axes = FALSE)
  rect(col(colours) - 1, -row(colours) + 1, col(colours), -row(colours),
       col = colours, border = borders)
  # conditional labelling; cex_label is now actually applied (it was unused)
  if (names_or_hex == "hex") {
    text(col(colours) - 0.5, -row(colours) + 0.5, colours_hex, cex = cex_label)
  } else if (names_or_hex == "names") {
    text(col(colours) - 0.5, -row(colours) + 0.5, names_colours, cex = cex_label)
  } else {
    text(col(colours) - 0.5, -row(colours) + 0.5,
         paste0(names_colours, '\n', colours_hex), cex = cex_label)
  }
}
# Map sample indices (or a factor of sample names) to their condition label.
# NOTE(review): indexing by a factor uses its integer codes, so this assumes
# the factor's level order matches the row order of samples.info — confirm.
label_cond <- function(x) {
  samples.info[["condition"]][x]
}
# relabel_id <- function(df, col) {
# if (exists("sample.info") && all(samples.info$Sample %in% df$col)) {
# matching_row_names <- samples.info$Sample[samples.info$Sample %in% df$col]
# df$col[df$col %in% matching_row_names] <- names(samples.info)[matching_row_names]
# }
#
# return(df)
# }
Keep in mind that you need a folder called images and another one called stats. In these folders we will save all the results from our analysis. We define them here so that they can be used throughout the analysis.
# Move to the project working directory; `wd` must be defined upstream
# (not visible in this chunk) — TODO confirm where it is set.
setwd(file.path(wd))
# Output folders (must already exist, see note above): figures and stats tables.
imgdir <- "./images/"
statdir <- "./stats/"
Load a sample info file with all the information from the ATAC seq.
library(readr)
# Sample sheet: one row per ATAC-seq library, semicolon-separated.
# Columns (per the echoed output below): stage, Name, Replicate, Sample.
samples.info <- read_delim("./info_embryo.csv",
delim = ";",
escape_double = FALSE,
trim_ws = TRUE)
head(samples.info,2)
## # A tibble: 2 × 4
## stage Name Replicate Sample
## <chr> <chr> <dbl> <chr>
## 1 emb4 embryo_stage4_rep1 1 e4_1_001
## 2 emb4 embryo_stage4_rep2 2 e4_2_001
We make a matrix with the consensus peak counts. We have the information spread across different files (one per stage), so we merge them into a single matrix.
# Specify the folder path
counts_path <- "./counts"
counts.file <- paste0(counts_path, "/merged_embryos.counts")
# Consensus-peak count matrix: peaks in rows, samples in columns,
# tab-separated, peak IDs taken from the first column as rownames.
counts <- as.matrix(read.csv(counts.file,row.names=1,sep = "\t"))
str(counts)
## int [1:49253, 1:12] 240 505 304 680 52 126 37 23 209 9 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:49253] "peak1" "peak2" "peak3" "peak4" ...
## ..$ : chr [1:12] "e10_1_001" "e10_2_001" "e12_1_001" "e12_2_001" ...
head(counts[,c(1, 2, 3)],2)
## e10_1_001 e10_2_001 e12_1_001
## peak1 240 253 236
## peak2 505 390 316
We import some peak information (TODO: expand the description of this table).
# Peak widths: two whitespace-separated columns, peak_id and width.
# Fix: the column-type spec must use the supplied column names; the original
# cols(X2 = col_integer()) referred to a default name ("X2") that does not
# exist when col_names is given, so the spec was silently ignored by readr.
peaks_width <- read_table("pfla_all_peaks_width.txt",
                          col_names = c("peak_id", "width"),
                          col_types = cols(width = col_integer()))
# Peak-to-gene annotation: tab-separated, no header, one row per peak.
peaks_annot <- read_delim("peaks_to_genes_annotation.tbl",
                          delim = "\t",
                          escape_double = FALSE,
                          col_names = c("peak_id", "cdip_gene", "uniprot_id", "FBgn_id", "FB_symbol", "FB_name"),
                          trim_ws = TRUE)
# Join annotation and width per peak; name the key explicitly for both sides
# (by.x alone relied on the default by-intersection for the y side).
peaks_info <- merge(peaks_annot,
                    peaks_width,
                    by = "peak_id")
This group of data can be used to define some variables of your experiment and some aesthetic parameter for your graphs.
Set the statistical parameters for the analysis:
# Statistical thresholds used throughout the differential-accessibility analysis.
MIN.FC <- 0.5  # minimum absolute log2 fold-change to call a peak differential
MIN.PV <- 0.05 # maximum p-value considered significant
# (message typos fixed: "máximum", "considired sigificative", missing "for")
cat(paste0("The fold-change threshold for this analysis is ", MIN.FC,
           "\nThe maximum p-value considered significant in this analysis is ", MIN.PV))
## The Fold change threshold this analysis is 0.5
## The máximum p-value considired sigificative in this analysis is 0.05
We establish the comparisons we wish to study. We define our
conditions to study as condition
# Developmental stage as an ordered factor; this defines the condition each
# sample belongs to for all downstream contrasts and colour scales.
stage_levels <- c("emb4", "emb6", "emb8", "emb10", "emb12", "emb14")
condition_factors <- factor(samples.info$stage, levels = stage_levels)
samples.info$condition <- condition_factors
cat(paste0('Condition: ', levels(condition_factors), '\n'))
## Condition: emb4
## Condition: emb6
## Condition: emb8
## Condition: emb10
## Condition: emb12
## Condition: emb14
Set some color parameters to have coherent graphs:
# IMPORTANT: The name of your conditions must be the same defined in samples.info
# One hex colour per developmental stage; names must match the levels of
# samples.info$condition so that scale_*_manual(values = ...) can map them.
condition.colors <- c("emb4"="#F4AE3E",
"emb6"="#8EA4D2",
"emb8"="#4A314D",
"emb10"="#F59CA9",
"emb12"="#32936F",
"emb14"="#2B3A67")
# Preview the palette with both the condition names and the hex codes.
my_color_pal(colours = condition.colors,
names_or_hex = "other", #names=condition hex=color_code other=both
borders=NA)
# Counts exploration
We explore the counts data without any transformation
Initial exploration of the raw data. In this section we will explore whether there are any inconsistencies in our raw data and whether it is necessary to eliminate some replicates.
We filter the rows with no counts or only 1 count. With this filter we eliminate some useless data and speed up the downstream processes.
# Drop peaks with a total count of 2 or less across all samples.
# NOTE(review): the text above says "no counts or only 1 count", but `> 2`
# also removes rows summing exactly 2 — confirm the intended cutoff.
keep <- rowSums(counts) > 2
counts <- counts[keep,]
#merge of the raw counts with the sample information. We also transform counts to a log2 scale for better visualization
# Long format: one row per (peak, sample) pair; `ind` holds the sample name.
dt.exp_raw <- stack(as.data.frame(counts))
# Small pseudo-count keeps zeros finite (log2(1e-3) is about -9.97).
dt.exp_raw$values <- log2(dt.exp_raw$values + 1e-3)
dt.exp_raw$gp <- label_cond(dt.exp_raw$ind)
colnames(dt.exp_raw) <- c('log2exp','sampleID','colorID')
We plot the variation of raw counts across our samples
# Violin plot of the log2 raw counts for every sample, coloured by condition.
boxplot.expr_raw <- ggplot(dt.exp_raw, aes(x = sampleID, y = log2exp, fill = colorID)) +
  geom_violin(show.legend = FALSE) +
  scale_fill_manual(values = condition.colors) +
  labs(title = "Boxplot of raw counts", x = "Samples", y = "log2 raw counts") +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1))
print(boxplot.expr_raw)
It’s also important to know the library size differences
# Per-sample totals of the raw counts (library sizes).
libsize <- data.frame(LibSize = colSums(counts))
libsize$sampleID <- as.factor(rownames(libsize))
libsize$colorID <- label_cond(libsize$sampleID)
# Horizontal bar chart of the library sizes, coloured by condition.
libsize_plot <- ggplot(libsize, aes(x = sampleID, y = LibSize, fill = colorID)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = condition.colors) +
  labs(title = "Library size by sample", x = "Sample", y = "Library Size") +
  coord_flip() +
  theme_light() +
  theme(legend.position = "none")
print(libsize_plot)
Our first normalization is by peak width. We will divide the counts by the width of the peak.
# Width normalisation: divide each peak's counts by that peak's width.
width_values <- setNames(peaks_info$width, peaks_info$peak_id)
# Fix: MARGIN must be 1 — peaks are the ROWS of `counts` and widths are a
# per-peak vector. The original swept MARGIN = 2, mismatching the per-peak
# widths against the sample columns. The widths are also indexed by
# rownames(counts) so they stay aligned after the low-count filtering above.
counts <- sweep(counts, 1, width_values[rownames(counts)], "/")
From our matrix we create an object called dge
containing information from our counts and sample information.
# Bundle counts + sample metadata into an edgeR DGEList; `group` records the
# condition each library belongs to (see the echoed structure below).
dge <- DGEList(counts=counts,
samples=samples.info,
group=samples.info$condition)
str(dge)
## Formal class 'DGEList' [package "edgeR"] with 1 slot
## ..@ .Data:List of 2
## .. ..$ : num [1:49253, 1:12] 0.787 2.087 2.171 1.071 0.109 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:49253] "peak1" "peak2" "peak3" "peak4" ...
## .. .. .. ..$ : chr [1:12] "e10_1_001" "e10_2_001" "e12_1_001" "e12_2_001" ...
## .. ..$ :'data.frame': 12 obs. of 8 variables:
## .. .. ..$ group : Factor w/ 6 levels "emb4","emb6",..: 1 1 2 2 3 3 4 4 5 5 ...
## .. .. ..$ lib.size : num [1:12] 17782 16938 14637 17050 18853 ...
## .. .. ..$ norm.factors: num [1:12] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. ..$ stage : chr [1:12] "emb4" "emb4" "emb6" "emb6" ...
## .. .. ..$ Name : chr [1:12] "embryo_stage4_rep1" "embryo_stage4_rep2" "embryo_stage6_rep1" "embryo_stage6_rep6" ...
## .. .. ..$ Replicate : num [1:12] 1 2 1 2 1 2 1 2 1 2 ...
## .. .. ..$ Sample : chr [1:12] "e4_1_001" "e4_2_001" "e6_1_001" "e6_2_001" ...
## .. .. ..$ condition : Factor w/ 6 levels "emb4","emb6",..: 1 1 2 2 3 3 4 4 5 5 ...
## ..$ names: chr [1:2] "counts" "samples"
We must also define the design of our assay. In limma we define a matrix-like design that will indicate which are our hypothesis contrasts. This is one of the key steps in the limma analysis.
# One-hot design with no intercept: one indicator column per stage.
# NOTE(review): the second argument fills model.matrix()'s `data` slot;
# the formula only references samples.info, so passing `dge` here is
# unusual and likely has no effect — confirm.
design.condition <- model.matrix( ~0+ samples.info$condition, dge )
# Strip the "samples.info$condition" prefix so columns are just the stage names.
colnames(design.condition) <- gsub("samples.info[$]condition", "", colnames(design.condition))
# NOTE(review): nothing matches "samples.info$sample" after the previous
# substitution, so this line appears to be dead code — confirm before removing.
colnames(design.condition) <- gsub("samples.info[$]sample", "", colnames(design.condition))
names(attr(design.condition,"contrasts")) <- c("condition")
rownames(design.condition) <-samples.info$Sample
design.condition
## emb4 emb6 emb8 emb10 emb12 emb14
## e4_1_001 1 0 0 0 0 0
## e4_2_001 1 0 0 0 0 0
## e6_1_001 0 1 0 0 0 0
## e6_2_001 0 1 0 0 0 0
## e8_1_001 0 0 1 0 0 0
## e8_2_001 0 0 1 0 0 0
## e10_1_001 0 0 0 1 0 0
## e10_2_001 0 0 0 1 0 0
## e12_1_001 0 0 0 0 1 0
## e12_2_001 0 0 0 0 1 0
## e14_1_001 0 0 0 0 0 1
## e14_2_001 0 0 0 0 0 1
## attr(,"assign")
## [1] 1 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"
If the sequencing depth is reasonably consistent across the samples, then the simplest and most robust approach is to use limma-trend. This approach will usually work well if the ratio of the largest library size to the smallest is not more than about 3-fold. We can check this in [Data exploration].
# limma-trend suitability check: the largest/smallest library-size ratio
# should stay below roughly 3-fold (see the text above). Uses the raw
# library sizes computed before normalisation.
print(paste0('The largest library have: ',max(libsize$LibSize),' counts'))
## [1] "The largest library have: 7615361 counts"
print(paste0('The smallest library have: ',min(libsize$LibSize),' counts'))
## [1] "The smallest library have: 2703453 counts"
print(paste0("That's ", round(max(libsize$LibSize)/min(libsize$LibSize),1), " times bigger"))
## [1] "That's 2.8 times bigger"
In the limma-trend, the counts are normalized and converted to logCPM values.
# TMM normalisation factors, then log2 counts-per-million with a prior
# count of 3 to damp the variability of low-count peaks.
dge <- calcNormFactors(dge, method="TMM")
logCPM <- cpm(dge, log=TRUE, prior.count=3)
After normalization we can observe how the counts distribution changed and also how the normalized library sizes are corrected.
#merge of the logCPM with the sample information.
# Long format: one row per (peak, sample) pair of normalised values.
dt.exp <- stack(as.data.frame(logCPM))
dt.exp$gp <- label_cond(dt.exp$ind)
names(dt.exp) <- c('log2exp', 'sampleID', 'colorID')
# Violin plot of the normalised log2 CPM values per sample.
boxplot.expr_raw <- ggplot(dt.exp, aes(x = sampleID, y = log2exp, fill = colorID)) +
  geom_violin(show.legend = FALSE) +
  scale_fill_manual(values = condition.colors) +
  labs(title = "Boxplot of normalized counts", x = "Samples", y = "log2 CPM") +
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 45, hjust = 1))
print(boxplot.expr_raw)
It’s also important to know the library size differences
# Per-sample column totals of the logCPM matrix, shown with the same bar
# chart as before normalisation.
# NOTE(review): a sum of logCPM values is not a library size in the usual
# sense (every CPM column sums to ~1e6 before the log) — confirm this is
# the intended post-normalisation comparison.
libsize <- as.data.frame(colSums(logCPM))
colnames(libsize) <- "LibSize"
libsize$sampleID <- as.factor(row.names(libsize))
libsize$colorID <- label_cond(libsize$sampleID)
libsize_plot <- ggplot(libsize, aes(x=sampleID, y=LibSize, fill=colorID)) +
geom_bar(stat="identity") +
scale_fill_manual(values=condition.colors) +
labs(title="Library size by sample after normalitzation") +
ylab("Library Size") +
xlab("Sample") +
coord_flip() + theme_light() +
theme(legend.position="none")
print(libsize_plot)
We observe the PCA distribution of our samples
# PCA of the samples: peaks are rows and samples are columns, so prcomp()
# treats each sample as a variable and $rotation holds one coordinate row
# per sample. Spell out `scale.` and TRUE instead of relying on partial
# argument matching and the reassignable `T` shortcut.
pcaData <- prcomp(counts, scale. = TRUE)
pcaData.df <- as.data.frame(pcaData$rotation)
# Percentage of variance explained per component, for the axis labels.
percentVar <- round(summary(pcaData)$importance[2, ] * 100, 2)
pcaData.df$sampleID <- as.factor(rownames(pcaData.df))
pcaData.df$colorID <- label_cond(pcaData.df$sampleID)
# Scatter of PC1 vs PC2 with repelled sample labels, coloured by condition.
pca.expr_raw <- ggplot(pcaData.df, aes(x = PC1, y = PC2, color = colorID, label = sampleID)) +
  geom_point(size = 3) +
  scale_color_manual(values = condition.colors) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  geom_label_repel(aes(label = sampleID),
                   box.padding = 0.35,
                   point.padding = 0.5,
                   segment.color = 'grey50') +
  theme_light() +
  theme(legend.position = "none")
print(pca.expr_raw)
We can also see the sample distribution in a dendrogram.
# Spearman-correlation distance between samples, clustered hierarchically.
# Renamed the local `dist` so it no longer masks stats::dist().
sample_dist <- as.dist(1 - cor(counts, method = "spearman"))
sample_clust <- hclust(sample_dist)
dendrogram <- as.dendrogram(sample_clust, hang = 0.1)
# Colour each leaf label by its condition, in dendrogram leaf order.
labels_colors(dendrogram) <- condition.colors[samples.info$condition][order.dendrogram(dendrogram)]
dend_raw <- function() {
  plot(dendrogram,
       horiz = FALSE,
       yaxt = 'n', ann = FALSE)
}
dend_raw()
Limma is a package for the analysis of gene expression data arising from microarray or RNA-seq technologies. A core capability is the use of linear models to assess differential expression in the context of multi-factor designed experiments. In the limma approach to RNA-seq, read counts are converted to log2-counts-per-million (logCPM) and the mean-variance relationship is modeled either with precision weights or with an empirical Bayes prior trend.
We fit our data to limma linear model
# Pairwise contrasts between consecutive developmental stages. In limma,
# "A-B" is coefficient A minus B, so a positive logFC means higher
# accessibility in the earlier stage of the pair.
contrasts= c(paste("emb4","emb6", sep="-"),
paste("emb6","emb8", sep="-"),
paste("emb8","emb10", sep="-"),
paste("emb10","emb12", sep="-"),
paste("emb12","emb14", sep="-"))
contrasts.matrix <- makeContrasts(contrasts=contrasts, levels=design.condition)
# limma-trend: linear model on logCPM, then the contrasts, then empirical
# Bayes moderation with a mean-variance trend (trend=TRUE).
fitt <- lmFit(logCPM, design.condition)
fitt <- contrasts.fit(fitt, contrasts.matrix)
fitt <- eBayes(fitt, trend=TRUE)
summary(decideTests(fitt))
## emb4-emb6 emb6-emb8 emb8-emb10 emb10-emb12 emb12-emb14
## Down 0 0 1290 2 1
## NotSig 49252 49253 46918 49227 49250
## Up 1 0 1045 24 2
We plot how many newly opened peaks we have per stage transition. We define as newly opened the upregulated peaks and as newly closed the downregulated peaks.
# Summarise decideTests() per stage transition and plot the number of
# differentially opened ("Up") and closed ("Down") peaks.
de_df <- data.frame(t(summary(decideTests(fitt))))
colnames(de_df) <- c("stg_trans", "exp", "n")
# Keep only the significant categories.
filtered_df <- de_df[de_df$exp != "NotSig", ]
# (label typos fixed: "Satge" -> "Stage", "differentally" -> "differentially";
# placeholder title replaced with a descriptive one)
ggplot(filtered_df, aes(x = stg_trans, y = n, color = exp, group = exp)) +
  geom_line() +
  geom_point() +
  labs(title = "Differentially regulated peaks per stage transition",
       x = "Stage transition",
       y = "Number of differentially regulated peaks",
       color = "Chromatin state") +
  scale_color_manual(values = c("Down" = "#67B3E0", "Up" = "#FE938C"),
                     labels = c("Down" = "Closed", "Up" = "Opened")) +
  theme_minimal()
# Per-contrast report generator: builds an annotated topTable, classifies
# peaks at raw ("up"/"down") and FDR-adjusted ("UP"/"DOWN") significance,
# and writes four pheatmap PNGs (FDR-significant-only and top-most-varying,
# each labelled with contig IDs and with human homologs).
# Depends on globals not visible in this chunk: annotations_table,
# samples.info ($barcode, $shrtlbl, $condition), imgdir, MIN.PV, MIN.FC —
# TODO confirm they are all defined before this is called.
# Naming convention inside: Capitalised objects (Signif/Tbl/Rows) track the
# FDR-significant set, lower-case ones the raw-p significant set.
make_toptable_arraysonly <- function (thyset,de.efit,expr.raw,thycontrast,
thylbl="",thyfun="",max.hmp.ROWS=100) {
cat("# Running on ",thyset,".limma.",thylbl,thycontrast,"\n",sep="");
# Full differential table for this contrast, FDR-adjusted, sorted by B.
top.res <- topTable(de.efit, coef=thycontrast, adjust="fdr", sort.by="B", number=Inf)
top.res$contig.ID <- rownames(top.res)
top.res <- left_join(top.res,annotations_table,by="contig.ID")
# Signed ranking score: -log10(p) * logFC, boosted 1e2 for p-significant
# and 1e4 for FDR-significant peaks that also pass the fold-change cutoff.
top.res$rank <- ifelse(is.na(top.res$P.Value) | is.na(top.res$logFC), NA,
# ifelse(is.na(top.res$logFC), -1e160, -1e150 * top.res$logFC),
-log10(top.res$P.Value)*top.res$logFC *
ifelse(top.res$P.Value <= MIN.PV & abs(top.res$logFC) >= MIN.FC,
ifelse(top.res$adj.P.Val <= MIN.PV, 1e4, 1e2), 1));
# Direction labels based on the raw p-value.
top.res$DGE.pval <- as.factor(
ifelse(top.res$P.Value <= MIN.PV & abs(top.res$logFC) >= MIN.FC,
ifelse(top.res$logFC >= MIN.FC, "up", "down"), "no-sig") );
# Direction labels based on the FDR-adjusted p-value.
top.res$DGE.padj <- as.factor(
ifelse(top.res$adj.P.Val <= MIN.PV & abs(top.res$logFC) >= MIN.FC,
ifelse(top.res$logFC >= MIN.FC, "UP", "DOWN"), "no-sig") );
rownames(top.res) <- top.res$contig.ID
contrast.Signif <- c( top.res$DGE.padj == "UP" | top.res$DGE.padj == "DOWN" )
contrast.Signif.ids <- top.res$contig.ID[contrast.Signif]
contrast.signif <- c( top.res$DGE.pval == "up" | top.res$DGE.pval == "down" )
contrast.signif.ids <- top.res$contig.ID[contrast.signif]
# Stats side-by-side with the expression values for the selected peaks.
contrast.Tbl <- cbind(top.res[ contrast.Signif.ids, ],
expr.raw[ contrast.Signif.ids, ])
contrast.tbl <- cbind(top.res[ contrast.signif.ids, ],
expr.raw[ contrast.signif.ids, ])
cRows <- nrow(contrast.Tbl)
crows <- nrow(contrast.tbl)
# Cap each heatmap at max.hmp.ROWS rows.
cmax.Rows <- min(cRows,max.hmp.ROWS) #100)
cmax.rows <- min(crows,max.hmp.ROWS) #100)
cat("# Heatmap for ",thyset,".limma.",thylbl,thycontrast,
" nrows=",cRows,"/",cmax.Rows," : ",crows,"/",cmax.rows,"\n",sep="");
# Keep only the rows with the largest |rank| for each heatmap.
contrast.Tmp <- (contrast.Tbl[with(contrast.Tbl,
order(abs(contrast.Tbl$rank), # logFC),
decreasing=TRUE )),]
)[1:cmax.Rows,]
contrast.tmp <- (contrast.tbl[with(contrast.tbl,
order(abs(contrast.tbl$rank), # logFC),
decreasing=TRUE )),]
)[1:cmax.rows,]
# Samples belonging to the two conditions named in the contrast.
# NOTE(review): this parses "condA_vs_condB", but the contrasts built
# earlier in this file use "-" as separator — confirm which format is
# actually passed in here.
cond.select <- c (sub("_vs_.*", "", thycontrast), sub(".*_vs_", "", thycontrast))
barcode.select <- samples.info[samples.info$condition %in% cond.select,]$barcode
# Heatmap 1: FDR-significant peaks labelled by contig ID; drawn once on the
# current device by pheatmap, then replayed into a PNG via its gtable.
pHM1 <-pheatmap::pheatmap(contrast.Tmp[, barcode.select],
labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
labels_row = contrast.Tmp[, "contig.ID"],
angle_col=45,
main = paste0("Most varying ",cmax.Rows," of ",cRows," significant markers\nfor ",
thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
png(file=paste0(imgdir,"limma_heatmaps_",thyfun,"expression_significantgenesonly.",thyset,".limma.",thylbl,".",thycontrast,".png"),
res=600, height=max(4,(20 * cmax.Rows/100)), width=10, unit="in", pointsize=10);
grid::grid.newpage()
grid::grid.draw(pHM1$gtable)
dev.off();
# Heatmap 2: same FDR-significant rows labelled by human homolog.
pHM2 <- pheatmap::pheatmap(contrast.Tmp[, barcode.select],
labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
labels_row = contrast.Tmp[, "Human.homolog"],
angle_col=45,
main = paste0("Most varying ",cmax.Rows," of ",cRows," significant markers\nfor ",
thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
png(file=paste0(imgdir,"limma_heatmaps_with_homologs_",thyfun,"expression_significantgenesonly.",thyset,".limma.",thylbl,".",thycontrast,".png"),
res=600, height=max(4,(20 * cmax.Rows/100)), width=10, unit="in", pointsize=10);
grid::grid.newpage()
grid::grid.draw(pHM2$gtable)
dev.off();
# Heatmap 3: raw-p significant peaks labelled by contig ID.
# NOTE(review): the title says "of cRows" (FDR set size) but this heatmap
# shows the raw-p set — crows was probably intended. Same on heatmap 4.
phm1 <- pheatmap::pheatmap(contrast.tmp[, barcode.select],
labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
labels_row = contrast.tmp[, "contig.ID"],
angle_col=45,
main = paste0("Most varying ",cmax.rows," of ",cRows," markers\n for ",
thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
png(file=paste0(imgdir,"limma_heatmaps_",thyfun,"expression_topmostvaryinggenes.",thyset,".limma.",thylbl,".",thycontrast,".png"),
res=600, height=max(4,(20 * cmax.rows/100)), width=10, unit="in", pointsize=10);
grid::grid.newpage()
grid::grid.draw(phm1$gtable)
dev.off();
# Heatmap 4: raw-p significant peaks labelled by human homolog.
phm2 <-pheatmap::pheatmap(contrast.tmp[, barcode.select],
labels_col = samples.info[samples.info$barcode %in% barcode.select,]$shrtlbl,
labels_row = contrast.tmp[, "Human.homolog"],
angle_col=45,
main = paste0("Most varying ",cmax.rows," of ",cRows," markers\n for ",
thyset,".limma.",thylbl,thycontrast," (",thyfun,")"))
png(file=paste0(imgdir,"limma_heatmaps_with_homologs_",thyfun,"expression_topmostvaryinggenes.",thyset,".limma.",thylbl,".",thycontrast,".png"),
res=600, height=max(4,(20 * cmax.rows/100)), width=10, unit="in", pointsize=10);
grid::grid.newpage()
grid::grid.draw(phm2$gtable)
dev.off();
}
We generate a table with the results:
# Build an annotated topTable for one contrast, adding a ranking score and
# up/down classifications at the raw ("up"/"down") and FDR-adjusted
# ("UP"/"DOWN") significance levels. Returns the augmented data frame.
# Uses the globals MIN.PV and MIN.FC defined at the top of the analysis.
make_top_table <- function(fit_model, contrast) {
  top.res.t <- topTable(fit_model, coef = contrast,
                        adjust.method = "fdr", sort.by = "B",
                        number = Inf)
  # Signed score: -log10(p) * logFC, boosted 1e2 for p-significant and 1e4
  # for FDR-significant peaks that also pass the fold-change cutoff
  # (same scheme as make_toptable_arraysonly).
  top.res.t$rank <- ifelse(is.na(top.res.t$P.Value) | is.na(top.res.t$logFC), NA,
                           -log10(top.res.t$P.Value) * top.res.t$logFC *
                             ifelse(top.res.t$P.Value <= MIN.PV & abs(top.res.t$logFC) >= MIN.FC,
                                    ifelse(top.res.t$adj.P.Val <= MIN.PV, 1e4, 1e2), 1))
  # Fix: the classification now also requires |logFC| >= MIN.FC, matching
  # make_toptable_arraysonly. Previously a peak with p <= MIN.PV but a
  # |logFC| below the declared threshold was still labelled "down".
  top.res.t$DGE.pval <- as.factor(
    ifelse(top.res.t$P.Value <= MIN.PV & abs(top.res.t$logFC) >= MIN.FC,
           ifelse(top.res.t$logFC >= MIN.FC, "up", "down"), "no-sig"))
  top.res.t$DGE.padj <- as.factor(
    ifelse(top.res.t$adj.P.Val <= MIN.PV & abs(top.res.t$logFC) >= MIN.FC,
           ifelse(top.res.t$logFC >= MIN.FC, "UP", "DOWN"), "no-sig"))
  # (Removed the unused UPDN/updn tallies; my_volcanoplot recomputes them.)
  top.res.t
}
# One topTable per contrast, stored in a list named by contrast.
top_table_list <- setNames(lapply(contrasts, function(ctr) make_top_table(fitt, ctr)),
                           contrasts)
We add the gene annotation information to the table
# Attach the peak-to-gene annotation to each contrast's table.
top_table_list_annot <- lapply(contrasts, function(ctr) {
  tbl_with_ids <- tibble::rownames_to_column(top_table_list[[ctr]], "peak_id")
  merge(tbl_with_ids, peaks_annot, by.x = "peak_id")
})
names(top_table_list_annot) <- contrasts
We plot the results in a Volcano plot:
# Volcano plot for one contrast using EnhancedVolcano.
# top.res.table: data frame from make_top_table() (needs logFC, adj.P.Val
#                and DGE.padj columns); contrast_name: subtitle label.
my_volcanoplot <- function(top.res.table, contrast_name) {
#define some values for the caption
# Counts of FDR-significant peaks in each direction, shown in the caption.
UPDN <- c(nrow(top.res.table[ top.res.table$DGE.padj == "UP", ]),
nrow(top.res.table[ top.res.table$DGE.padj == "DOWN", ]))
# Axis limits padded slightly beyond the observed logFC / adj.P.Val ranges.
EnhancedVolcano(data.frame(top.res.table), x = 'logFC', y = 'adj.P.Val',
lab = rownames(top.res.table),
xlab = bquote(~Log[2]~ 'fold change'),
ylab = bquote(~-Log[10]~adjusted~italic(P)),
xlim = c(min(top.res.table$logFC, na.rm = TRUE) - 0.5, max(top.res.table$logFC, na.rm = TRUE) + 0.5),
ylim = c(0, max(-log10(top.res.table$adj.P.Val), na.rm = TRUE) + 1),
pCutoff = MIN.PV, FCcutoff = MIN.FC, pointSize = 1.0, labSize = 2.0,
title = "Volcano Plot",
subtitle = contrast_name,
caption = paste0('log2 FC cutoff: ', MIN.FC, '; p-value cutoff: ',
MIN.PV, '\nTotal = ', nrow(top.res.table),
' markers [ ',UPDN[1],'UP, ',UPDN[2],'DOWN ]'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0,
drawConnectors = TRUE, widthConnectors = 0.25,
colConnectors = 'grey30')}
# One volcano plot per contrast, kept in the same order as top_table_list.
volcano_plot_list <- lapply(seq_along(top_table_list), function(idx) {
  my_volcanoplot(top.res.table = top_table_list[[idx]],
                 contrast_name = names(top_table_list)[[idx]])
})
print(volcano_plot_list)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
library("Mfuzz")
library("Biobase")
library("stringr")
We are going to use the normalized counts in CPMs saved as
logCPM (see the Normalization section). We collapse the
biological replicates to do this experiment because we normalized them
and we have seen that they are not too different.
# Collapse biological replicates: one column per stage containing the mean
# logCPM of that stage's replicates.
logCPM_collapsed <- matrix(0, nrow = nrow(logCPM), ncol = nlevels(condition_factors))
colnames(logCPM_collapsed) <- levels(condition_factors)
rownames(logCPM_collapsed) <- rownames(logCPM)
# Sample prefix before the first underscore, e.g. "e4" from "e4_1_001"
# (base sub() replaces the stringr::str_extract() call).
sample_prefix <- sub("_.*$", "", colnames(logCPM))
for (i in seq_len(nlevels(condition_factors))) {
  # Fix: map the stage level ("emb4") to its sample prefix ("e4") so that
  # column i really contains the stage named colnames(logCPM_collapsed)[i].
  # The original paired levels with unique(colname prefixes) by position,
  # which mislabels every column when the two orders differ (the matrix
  # columns start at e10 while the levels start at emb4).
  stage_prefix <- sub("^emb", "e", levels(condition_factors)[i])
  stage_cols <- sample_prefix == stage_prefix
  # rowMeans, not rowSums: summing logCPM values doubles the scale and is
  # no longer a per-replicate logCPM "collapse" as described above.
  logCPM_collapsed[, i] <- rowMeans(logCPM[, stage_cols, drop = FALSE])
}
We save the data in a new object using ExpressionSet
# Wrap the collapsed logCPM matrix in a Biobase ExpressionSet, the input
# container expected by the Mfuzz functions below.
normalizedSet <- ExpressionSet(assayData=logCPM_collapsed)
We filter any possible NA values. We also standardize the expression values of every peak so that the average expression value for each peak is zero and the standard deviation of its expression profile is one
# Remove peaks with more than 50% missing values (none are excluded here,
# per the echoed output).
embryo.r <- filter.NA(normalizedSet, thres=0.5)
## 0 genes excluded.
# Impute any remaining NAs by weighted k-nearest neighbours.
embryo.f <- fill.NA(embryo.r,mode="knnw")
# Standardise each peak to mean 0 / sd 1, as Mfuzz clustering expects.
embryo.s <- standardise(embryo.f)
We optimize the parameters for the Mfuzz analysis. We will use two
parameters: - fuzzifier value (m): we will determine the optimal fuzzifier
value using mestimate. This will only be an estimate, since
the final value is tuned depending on the performance of
the function. - number of clusters: the final number of clusters. This will
be determined using repeated soft clustering over a range of cluster
numbers, which reports the minimum centroid distance. The minimum
centroid distance can be used as a cluster validity index. For an optimal
cluster number, we may see a 'drop' of the minimum centroid distance when
plotted versus a range of cluster numbers, and a slower decrease of the
minimum centroid distance for higher cluster numbers.
# Estimate the fuzzifier m from the standardised data.
m_est <- mestimate(embryo.s)
m_est
## [1] 1.70376
# NOTE(review): m is set to 1.5 below, overriding the mestimate() value of
# ~1.70 — confirm this is deliberate.
m=1.5
# Minimum centroid distance over a range of cluster numbers (5, 10, ..., 40).
# NOTE(review): Dmin() is run on embryo.f (filled, non-standardised) while
# the clustering below uses embryo.s — confirm which object is intended.
plot_opt_cluster <- Dmin(embryo.f,m=m,crange=seq(5,40,5),repeats=30,visu=TRUE)
print(plot_opt_cluster)
## [1] 0.36083955 0.19994997 0.14980903 0.13156243 0.11949226 0.11002667 0.10342429
## [8] 0.09765822
We define the final parameters for the Mfuzz analysis
# Final soft-clustering parameters (see the optimisation step above).
m=1.5
cluster=35
mf <- mfuzz(embryo.s,c=cluster,m=m)
# One panel per cluster; x-axis ticks labelled by developmental stage.
mfuzz.plot(embryo.s,cl=mf, time.labels=c("emb4","emb6","emb8","emb10","emb12","emb14"),new.window=FALSE)
The results are stored in the mf object. Some
interesting metrics are: - centers: the final cluster centers. - size:
the number of data points in each cluster of the closest hard
clustering. - cluster: a vector of integers containing the indices of
the clusters where the data points are assigned to for the closest hard
clustering, as obtained by assigning points to the (first) class with
maximal membership. - membership: a matrix with the membership values of
the data points to the clusters.
We will make a data frame with the cluster assigned to each peak and add the annotation information. This will be useful for downstream analysis like GO or pathway enrichment.
# One row per peak with its assigned fuzzy cluster, joined to the gene
# annotation for downstream GO / pathway enrichment.
# (Columns are named directly instead of create-then-rename, and the merge
# key is stated explicitly for both sides rather than via by.x only.)
mf_clusters <- data.frame(peak_id = names(mf$cluster),
                          cluster = as.numeric(mf$cluster))
mf_clusters_annot <- merge(mf_clusters, peaks_annot, by = "peak_id")